load libraries

library("tidyverse")
library("plyr")
library("dplyr")
library("ggplot2")
library("scales")
library("ggpubr")
library("gridExtra")
library("grid")
library("GGally")
library("vcfR") # for extracting genotype data from a vcf file
library("data.table")
library("stringr")
library("janitor")
library("plotly")
library("graphics")
library("quest")
library("cowplot")


#library("gmodels")
#library("rstatix")
#library("freqtables")
#library("broom")
#library("patchwork") # for gathering the plots
#library("fuzzyjoin") # to join tables based on a string in a column
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                     fig.width = 10,
                      fig.asp = 0.4,
                      out.width = "100%")
#fig.width = 6,fig.asp = 0.8,out.width = "100%"

Variant mean depth

all samples

Next we will examine the mean depth for each of our variants. This is essentially the number of reads that have mapped to this position. The output we generated with vcftools is the mean of the read depth across all individuals - it is for both alleles at a position and is not partitioned between the reference and the alternative. First we read in the data.

depth_all <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.ldepth.mean", delim = "\t",
           col_names = c("chr", "pos", "mean_depth", "var_depth"), skip = 1)

#a <- ggplot(depth_all, aes(mean_depth)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
#a + theme_light()

summary(depth_all$mean_depth)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.65   17.92   19.09   19.39   20.51   40.85

Again take a moment to look at the data - mean_depth is our column of interest but note that you can also get a an idea of the variance in depth among individuals from the var_depth column. Once again, we will use ggplot to look at the distribution of read depths ### Mean depth by sex

depth_fem <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/female.Q40BIALLDP16HDP40mis.5Chr7/female.Q40BIALLDP16HDP40mis.5Chr7.ldepth.mean", delim = "\t",
           col_names = c("chr", "pos", "mean_depth", "var_depth"), skip = 1) %>% 
  dplyr::mutate(sex ="Female")
  
depth_male <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/male.Q40BIALLDP16HDP40mis.5Chr7/male.Q40BIALLDP16HDP40mis.5Chr7.ldepth.mean", delim = "\t",
           col_names = c("chr", "pos", "mean_depth", "var_depth"), skip = 1)%>% 
  dplyr::mutate(sex ="Male")

depth_sex = rbind(depth_fem,depth_male) %>% unite(site, c("chr", "pos"))
  
ggplot(depth_sex, aes(mean_depth, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Mean site depth, by sex")

ggplot(depth_sex, aes(var_depth, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light()+
  ggtitle("Mean site depth variance, by sex")

the site’s depth is somewhat higher in females, compared to males.
the variance is similar.

Variant missingness

Next up we will look at the proportion of missingness at each variant. This is a measure of how many individuals lack a genotype at a call site.

miss_fem <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/female.Q40BIALLDP16HDP40mis.5Chr7/female.Q40BIALLDP16HDP40mis.5Chr7.lmiss", delim = "\t",
                       col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss", "fmiss"), skip = 1) %>% 
  dplyr::mutate(sex ="Female")

miss_male <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/male.Q40BIALLDP16HDP40mis.5Chr7/male.Q40BIALLDP16HDP40mis.5Chr7.lmiss", delim = "\t",
                       col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss", "fmiss"), skip = 1) %>% 
  dplyr::mutate(sex ="Male")

miss_sex = rbind(miss_fem,miss_male) %>% unite(site, c("chr", "pos"))
  
ggplot(miss_sex, aes(fmiss, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant missingness, by sex")

summary(miss_sex$fmiss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.05357 0.41071 0.46237 0.46493 0.51786 0.71429

Variant quality

The first metric we will look at is the (Phred encoded) site quality. This is a measure of how much confidence we have in our variant calls. First of all, we read in the site quality report we generated using vcftools. We will use the read_delim command from the readr package (part of the the tidyverse) because it is more efficient for reading in large datafiles. It also allows us to set our own column names.

Take a look at the data when it is read in. You will see that for each site in our subsampled VCF, we have extracted the site quality score. Now we will plot the distribution of this quality using ggplot. Usually, the geom_density function works best, but you can use geom_histogram too.

qual_fem <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/female.Q40BIALLDP16HDP40mis.5Chr7/female.Q40BIALLDP16HDP40mis.5Chr7.lqual", delim = "\t",
           col_names = c("chr", "pos", "qual"), skip = 1) %>% 
  dplyr::mutate(sex ="Female")

qual_male <-read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/male.Q40BIALLDP16HDP40mis.5Chr7/male.Q40BIALLDP16HDP40mis.5Chr7.lqual", delim = "\t",
           col_names = c("chr", "pos", "qual"), skip = 1) %>% 
  dplyr::mutate(sex ="Male")
  
qual_sex = rbind(qual_fem,qual_male) %>% unite(site, c("chr", "pos"))
  
ggplot(qual_sex, aes(qual, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant quality, by sex")

summary(qual_male$qual)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     40.48  33240.80  48198.10  52338.46  69672.30 191031.00
summary(qual_fem$qual)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     40.48  33240.80  48198.10  52338.46  69672.30 191031.00

the quality is same for individuals, - its already normalized.

Load Variant Call Format (VCF) file.

Extract genotypes for each site and individual. The metadata for all samples can be found in here.

vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE )
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## *****        *****         *****
# extract the genotype for each site in each individual
gt <- extract.gt(vcf, element = "GT") 
gt <- as.data.frame(t(gt)) %>%
  rownames_to_column("ID")
#clean the ID names
gt$ID <- sub("_[^_]+$", "", gt$ID)

table <-  gt %>% 
  t() %>%
  as.data.frame() %>%
  row_to_names(row_number = 1) %>% 
  dplyr::select(contains(c("son", "dat", "fnd"))) # keep only adults of F0, F1 and F2 

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# or, include all F2 samples, but indicate if they have an adult sister (may indicate if the F1 female was fertilized)
#family = grep("grn",gt$ID, value=TRUE) %>%
#  str_extract("[^_]+")  %>%
#  unique()

F1 male (0/0) x female (0/1)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  rownames_to_column("site") %>%
  tidyr::pivot_longer(-site)  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  mutate(sex = case_when(
   grepl("son", sample) ~ "Male",
    grepl("dat", sample) ~ "Female")) %>%
  dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
    grepl("0/1", gt) ~ "Normal",
    grepl("0/0", gt) ~ "Exception")) %>%
  na.omit()
}

F2_male_00_01 = do.call("rbind", obs) %>%  dplyr::select(c("site","sample","site_status")) 

### depth
F2_male_00_01_depth = depth_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_00_01, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","mean_depth", "sample", "site_status")) 

p_F2_male_00_01_depth_box = F2_male_00_01_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
    geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 0/0 x 0/1")
 
p_F2_male_00_01_depth_dens = F2_male_00_01_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant depth in F2 males, of F1 cross 0/0 x 0/1")

### quality
F2_male_00_01_qual = qual_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_00_01, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","qual", "sample", "site_status")) 

p_F2_male_00_01_qual_box = F2_male_00_01_qual %>% ggplot(aes(x = site_status, y = qual)) +
    geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 0/0 x 0/1") + theme_light() 

p_F2_male_00_01_qual = F2_male_00_01_qual %>% ggplot(aes(qual, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant quality in F2 males, of F1 cross 0/0 x 0/1")

F1 male (1/1) x female (0/1)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  rownames_to_column("site") %>%
  tidyr::pivot_longer(-site)  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  mutate(sex = case_when(
   grepl("son", sample) ~ "Male",
    grepl("dat", sample) ~ "Female")) %>%
  dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
    grepl("0/1", gt) ~ "Normal",
    grepl("1/1", gt) ~ "Exception")) %>%
  na.omit()
}

F2_male_11_01 = do.call("rbind", obs) %>%  dplyr::select(c("site","sample","site_status")) 

### depth
F2_male_11_01_depth = depth_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_11_01, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","mean_depth", "sample", "site_status")) 

p_F2_male_11_01_depth_box = F2_male_11_01_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
    geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 1/1 x 0/1")
 
p_F2_male_11_01_depth_dens = F2_male_11_01_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant depth in F2 males, of F1 cross 1/1 x 0/1")

### quality
F2_male_11_01_qual = qual_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_11_01, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","qual", "sample", "site_status")) 

p_F2_male_11_01_qual_box = F2_male_11_01_qual %>% ggplot(aes(x = site_status, y = qual)) +
    geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 1/1 x 0/1") + theme_light() 

p_F2_male_11_01_qual = F2_male_11_01_qual %>% ggplot(aes(qual, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant quality in F2 males, of F1 cross 1/1 x 0/1")

F1 male (0/1) x female (0/0)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  rownames_to_column("site") %>%
  tidyr::pivot_longer(-site)  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  mutate(sex = case_when(
   grepl("son", sample) ~ "Male",
    grepl("dat", sample) ~ "Female")) %>%
  dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
    grepl("0/0", gt) ~ "Normal",
    grepl("0/1", gt) ~ "Exception")) %>%
  na.omit()
}

F2_male_01_00 = do.call("rbind", obs) %>%  dplyr::select(c("site","sample","site_status")) 

### depth
F2_male_01_00_depth = depth_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_01_00, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","mean_depth", "sample", "site_status")) 

p_F2_male_01_00_depth_box = F2_male_01_00_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
    geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 0/0")
 
p_F2_male_01_00_depth_dens = F2_male_01_00_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 0/0")

### quality
F2_male_01_00_qual = qual_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_01_00, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","qual", "sample", "site_status")) 

p_F2_male_01_00_qual_box = F2_male_01_00_qual %>% ggplot(aes(x = site_status, y = qual)) +
    geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 0/0") + theme_light() 

p_F2_male_01_00_qual = F2_male_01_00_qual %>% ggplot(aes(qual, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 0/0")

F1 male (0/1) x female (1/1)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  rownames_to_column("site") %>%
  tidyr::pivot_longer(-site)  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  mutate(sex = case_when(
   grepl("son", sample) ~ "Male",
    grepl("dat", sample) ~ "Female")) %>%
  dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
    grepl("1/1", gt) ~ "Normal",
    grepl("0/1", gt) ~ "Exception")) %>%
  na.omit()
}

F2_male_01_11 = do.call("rbind", obs) %>%  dplyr::select(c("site","sample","site_status")) 

### depth
F2_male_01_11_depth = depth_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_01_11, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","mean_depth", "sample", "site_status")) 

p_F2_male_01_11_depth_box = F2_male_01_11_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
    geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 1/1")
 
p_F2_male_01_11_depth_dens = F2_male_01_11_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 1/1")

### quality
F2_male_01_11_qual = qual_sex %>% 
  dplyr::filter(sex =="Male") %>% 
  full_join(F2_male_01_11, by = "site") %>% 
  na.omit() %>%
  dplyr::select(c("site","qual", "sample", "site_status")) 

p_F2_male_01_11_qual_box = F2_male_01_11_qual %>% ggplot(aes(x = site_status, y = qual)) +
    geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 1/1") + theme_light() 

p_F2_male_01_11_qual = F2_male_01_11_qual %>% ggplot(aes(qual, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 1/1")

pool all sites together

F2_male = rbind(mutate(F2_male_00_01_qual, cross = "male_00_01"),
                mutate(F2_male_11_01_qual, cross = "male_11_01"), 
                mutate(F2_male_01_00_qual, cross = "male_01_00"),
                mutate(F2_male_01_11_qual, cross = "male_01_11")) %>%
  dplyr::select(c("site", "qual", "site_status","cross"))

F2_male %>% ggplot(aes(qual, fill = site_status)) +
 geom_density(colour = "black", alpha = 0.5) + theme_light() +
  ggtitle("Variant quality in F2 males") +
  facet_wrap(~cross, ncol = 1,scales = "free_y" ) +
 scale_x_continuous(n.breaks = 12) +
    theme(axis.text.x=element_text(angle=90,hjust=1)) 

plot quality in each cross:

legend <- get_legend(p_F2_male_00_01_qual)   # get the legend of the first one plot

# here the plots in a grid
prow <- plot_grid( p_F2_male_00_01_qual + theme(legend.position="none"),
           # here you add the percentage
           p_F2_male_11_01_qual + theme(legend.position="none")+ scale_y_continuous(),
           p_F2_male_01_00_qual + theme(legend.position="none")+ scale_y_continuous(),
          p_F2_male_01_11_qual + theme(legend.position="none")+ scale_y_continuous(),
         align = 'v',
           #labels = c("A", "B"),
           hjust = -1,
           nrow = 4)

# here you add the legend
plot_grid( prow, legend, rel_widths = c(1, .2))

# Draw density plots
grid.arrange(arrangeGrob(p_F2_male_00_01_qual, p_F2_male_11_01_qual, p_F2_male_01_00_qual, p_F2_male_01_11_qual,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))

# Draw box plots
grid.arrange(arrangeGrob(p_F2_male_00_01_qual_box, p_F2_male_11_01_qual_box, p_F2_male_01_00_qual_box, p_F2_male_01_11_qual_box ,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))

plot depth in each cross:

# Draw density plots
grid.arrange(arrangeGrob(p_F2_male_00_01_depth_dens, p_F2_male_11_01_depth_dens, p_F2_male_01_00_depth_dens, p_F2_male_01_11_depth_dens,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))

# Draw box plots
grid.arrange(arrangeGrob(p_F2_male_00_01_depth_box, p_F2_male_11_01_depth_box, p_F2_male_01_00_depth_box, p_F2_male_01_11_depth_box ,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))

Individual based statistics

As well as a our per variant statistics we generated earlier, we also calculated some individual metrics too. WE can look at the distribution of these to get an idea whether some of our individuals have not sequenced or mapped as well as others. This is good practice to do with a new dataset. A lot of these statistics can be compared to other measures generated from the data (i.e. principal components as a measure of population structure) to see if they drive any apparent patterns in the data.

Mean depth per individual

First we will look at the distribution of mean depth among individuals. We read the data in with read_delim:

# NO filtering
ind_depth <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.idepth", delim = "\t",
                        col_names = c("ind", "nsites", "depth"), skip = 1) %>%
            mutate(sex = case_when(
                   grepl("son", ind) ~ "Male",
                   grepl("dat|fnd", ind) ~ "Female")) %>%
            na.omit()

ind_depth$ind <- sub("_[^_]+$", "", ind_depth$ind)

#Then we plot the distribution as a histogram using ggplot and geom_hist.
ggplot(ind_depth, aes(depth)) + geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)+ 
  theme_light()

ggplot(ind_depth, aes(depth)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)+
  theme_light()

p_ind_depth = ind_depth %>% filter(grepl("grnson|grndat", ind)) %>%
   ggplot(aes(y = depth, x = ind, color = sex, label = ind)) + geom_point() + 
  theme_classic() +  theme(axis.text.x = element_blank())


ggplotly(p_ind_depth)
#p_ind_depth + 
 # geom_label() +
 # geom_text(angle = 45,hjust = 0, nudge_x = 0.05) 

summary(ind_depth$depth)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.511  12.495  17.810  19.245  25.794  38.771

individuals with high depth (>30)

ind_depth %>% dplyr::filter(depth > 30) 
## # A tibble: 15 × 4
##    ind               nsites depth sex   
##    <chr>              <dbl> <dbl> <chr> 
##  1 133_133_fnd        35169  34.9 Female
##  2 476_477b_grnson    35169  30.0 Male  
##  3 46_47c_grndat      35169  32.4 Female
##  4 302_303_dat        35169  34.6 Female
##  5 478_478_fnd        35169  34.3 Female
##  6 498_499_dat        35169  38.8 Female
##  7 300_300a_son       35169  32.0 Male  
##  8 43_44_dat          35169  30.7 Female
##  9 284_284_fnd        35169  37.9 Female
## 10 564_565-1b_grnson  35169  34.0 Male  
## 11 458_459a_grnson    35168  31.6 Male  
## 12 476_476_fnd        35169  30.4 Female
## 13 133_133a_son       35169  30.8 Male  
## 14 478_479-1_dat      35169  31.6 Female
## 15 48_49_dat          35169  30.9 Female

individuals with low depth (<10)

 ind_depth %>% filter(depth < 10 ) 
## # A tibble: 17 × 4
##    ind               nsites depth sex   
##    <chr>              <dbl> <dbl> <chr> 
##  1 322_322_fnd        34061  7.42 Female
##  2 300_301_dat        33914  7.42 Female
##  3 596_597a_grnson    33837  8.29 Male  
##  4 266_266a_son       32842  6.51 Male  
##  5 110_110_fnd        33728  7.32 Female
##  6 187_188a_grnson    33632  7.15 Male  
##  7 498_498_fnd        34574  8.96 Female
##  8 300_301a_grnson    34099  8.67 Male  
##  9 177_178b_grnson    33963  7.04 Male  
## 10 240_241c_grnson    34833  9.36 Male  
## 11 266_266_fnd        33878  7.21 Female
## 12 478_479-2b_grnson  34677  9.02 Male  
## 13 564_565-2a_grndat  34648  9.74 Female
## 14 177_178a_grndat    33541  6.62 Female
## 15 43_43_fnd          34397  7.87 Female
## 16 478_479-2a_grndat  34710  8.77 Female
## 17 502_502_fnd        34632  9.61 Female

Proportion of missing data per individual

Next we will look at the proportion of missing data per individual. We read in the data below:

ind_miss  <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.imiss", delim = "\t", col_names = c("ind", "ndata", "nfiltered", "nmiss", "fmiss"), skip = 1) %>%
mutate(sex = case_when(
                   grepl("son", ind) ~ "Male",
                   grepl("dat|fnd", ind) ~ "Female")) %>%
            na.omit()

ind_miss$ind <- sub("_[^_]+$", "", ind_miss$ind)


p_ind_miss = ind_miss %>% filter(grepl("grnson|grndat", ind)) %>%
  ggplot(aes(y = fmiss, x = ind, color = sex)) + geom_point() + theme_classic()
ggplotly(p_ind_miss)
summary(ind_miss$fmiss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0967  0.1677  0.3899  0.4512  0.7280  0.9574

individuals with high missingness (>0.7)

  ind_miss %>% dplyr::filter(fmiss > 0.7) 
## # A tibble: 39 × 6
##    ind             ndata nfiltered nmiss fmiss sex   
##    <chr>           <dbl>     <dbl> <dbl> <dbl> <chr> 
##  1 596_596_fnd     35169         0 29393 0.836 Female
##  2 57_58b_grnson   35169         0 25086 0.713 Male  
##  3 57_57_fnd       35169         0 26129 0.743 Female
##  4 57_57a_son      35169         0 29440 0.837 Male  
##  5 43_43a_son      35169         0 25215 0.717 Male  
##  6 502_503b_grnson 35169         0 26472 0.753 Male  
##  7 426_426a_son    35169         0 29759 0.846 Male  
##  8 412_413a_grnson 35169         0 26609 0.757 Male  
##  9 322_323_dat     35169         0 29327 0.834 Female
## 10 322_322_fnd     35169         0 32729 0.931 Female
## # … with 29 more rows

individuals with low missingness (<0.15)

  ind_miss %>% filter(fmiss < 0.15 ) 
## # A tibble: 26 × 6
##    ind             ndata nfiltered nmiss  fmiss sex   
##    <chr>           <dbl>     <dbl> <dbl>  <dbl> <chr> 
##  1 63_64_dat       35169         0  3509 0.0998 Female
##  2 63_63a_son      35169         0  4700 0.134  Male  
##  3 400_400a_son    35169         0  4565 0.130  Male  
##  4 57_58_dat       35169         0  3484 0.0991 Female
##  5 338_339a_grndat 35169         0  3783 0.108  Female
##  6 338_338_fnd     35169         0  3492 0.0993 Female
##  7 46_47_dat       35169         0  5127 0.146  Female
##  8 110_111c_grnson 35169         0  4033 0.115  Male  
##  9 177_177a_son    35169         0  3901 0.111  Male  
## 10 177_178_dat     35169         0  4685 0.133  Female
## # … with 16 more rows

Sample yield

sample_yield <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/sample_yield.csv", delim = ",") %>%
mutate(sex = case_when(
                   grepl("son", ind) ~ "Male",
                   grepl("dat|fnd", ind) ~ "Female")) %>%
            na.omit()

p_sample_yield = sample_yield %>% filter(grepl("grnson|grndat", ind)) %>%
   ggplot(aes(y = conc, x = ind, color = sex, label = ind)) + geom_point() + 
  theme_classic() +  theme(axis.text.x = element_blank())

ggplotly(p_sample_yield)
 sample_yield %>% ggplot(aes(x = sex, y = conc)) +
    geom_boxplot() + ggtitle("Sample yield")

males have lower yield compare to females

correlate sample yield with depth and missingness

samples_param = sample_yield %>% left_join(ind_miss, by = "ind") %>% left_join(ind_depth, by ="ind") %>%
  dplyr::select(c("ind", "sex", "Extraction_date","conc","fmiss","depth")) %>%
  mutate(conc = as.numeric(conc)) %>%
    mutate(fmiss = as.numeric(fmiss)) %>%
      mutate(depth = as.numeric(depth))

df = samples_param %>%
  dplyr::select(c("conc","fmiss","depth"))

pairs(~ conc + fmiss + depth, data = df, upper.panel = NULL)

No correlation between sample’s depth, or missingnes, And the concentration upon extraction.

 samples_param %>% ggscatter(x = "conc", y = "fmiss", color = "sex")

 samples_param %>% ggscatter(x = "conc", y = "depth", color = "sex")

lower concentration for male samples

do abnormal males have lower DNA concentration?

abnorm_males = tibble(ind = c("240_241c_grnson", "400_401a_grnson","412_413a_grnson", "426_427b_grnson", "458_459a_grnson", "46_47d_grnson"), sex = "Male", normality = "abnormal")

 sample_normality = samples_param %>%
  left_join(abnorm_males, by = "ind") %>% 
    dplyr::select(-sex.y) %>% 
    replace(is.na(.), "normal") %>%
    dplyr::rename(sex = sex.x) %>%
    unite("type", sex,normality, remove = FALSE) 
#sample_normality %>% 
#  dplyr::filter(sex =="Male") %>%
 #pivot_longer(c(conc, fmiss, depth)) %>% 
  #ggplot(aes( y = value, color = type )) +
   # geom_boxplot()+
  #ggtitle("Sample quality") + 
 # facet_wrap(~name, scales = "free_y") +
  #theme_bw() +
  #theme(axis.text.x = element_blank()) 

sample_normality %>% 
 # dplyr::filter(sex =="Male") %>%
 pivot_longer(c(conc, fmiss, depth)) %>% 
  ggplot(aes( y = value, x = type, color = type )) +
    geom_boxplot()+
  ggtitle("Sample quality") + 
  facet_wrap(~name, scales = "free_y") +
  theme_bw() +
  theme(axis.text.x = element_blank()) +theme(axis.text.x = element_blank(),
        axis.title.x =element_blank() ) +
  geom_jitter(alpha = 0.5)

p =  sample_normality %>% 
  #dplyr::filter(sex =="Male") %>%
   ggscatter(x = "depth", y = "fmiss", color = "type")

ggplotly(p)

abnormal males dont have exceptional yield, depth or missingness.

fraction of low coverage sites in abnormal males? They could have ok coverage on average but it may be uneven

dp = extract.gt(vcf, element = "DP", as.numeric =TRUE)

dp <- as.data.frame(t(dp)) %>%
    rownames_to_column("ind") 

#clean the ID names
dp$ind <- sub("_[^_]+$", "", dp$ind)

table_dp <-  dp %>% 
  t() %>%
  as.data.frame() %>%
  row_to_names(row_number = 1) %>% 
  dplyr::select(contains(c("son", "dat", "fnd"))) # keep only adults of F0, F1 and F2 

summary(depth_all$mean_depth)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.65   17.92   19.09   19.39   20.51   40.85
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 14.65   17.92   19.09   19.39   20.51   40.85 

based on the quantiles values of the mean depth of individuals, I count the sites with:
- “low coverage” (<Q1) 17.92.
- “high coverage” (>Q3) 20.51.
then we calculate the frequency of low and high coverage sites, per individual

dp_df <- dp %>%
  column_to_rownames("ind") %>%
       mutate(total = rowSums(across(where(is.numeric)) > 0, na.rm = TRUE)) %>% 
      mutate(low_dp = rowSums(across(where(is.numeric)) < 17.92, na.rm = TRUE)) %>%
      mutate(high_dp = rowSums(across(where(is.numeric)) > 20.51, na.rm = TRUE)) %>% 
       mutate(low_freq = low_dp/total) %>%
       mutate(high_freq = high_dp/total) %>%
    dplyr::select(-contains("NW")) %>%
  rownames_to_column("ind") %>%
  inner_join(sample_normality, by = "ind")

p_box_low = dp_df %>%
    # dplyr::filter(sex =="Male") %>%
 ggplot(aes(x = type, y = low_freq, color = type ,text = ind)) +
    geom_boxplot()+ggtitle("Frequency of low coverage sites")+  
  theme(axis.text.x = element_blank(),
        axis.title.x =element_blank() ) +
  geom_jitter()+ theme_bw() 

p_box_high = dp_df %>%
    # dplyr::filter(sex =="Male") %>%
 ggplot(aes(x = type, y = high_freq, color = type ,text = ind)) +
    geom_boxplot()+ggtitle("Frequency of HIGH coverage sites")+  
  theme(axis.text.x = element_blank(),
        axis.title.x =element_blank() ) +
  geom_jitter()  + theme_bw() 


ggplotly(p_box_low, tooltip = "text")
ggplotly(p_box_high, tooltip = "text")

correlation btw frequcny of coverage sites, and mean_depth

dp_df %>%
       dplyr::filter(sex =="Male") %>%
  ggscatter(x = "depth", y = "low_freq", color = "type")